{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fbeff717",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ch02-3 Pandas Memory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1e62c334",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Libraries\n",
    "import numpy as np \n",
    "import pandas as pd "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "dbf171c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 753040 entries, 0 to 753039\n",
      "Data columns (total 35 columns):\n",
      " #   Column        Non-Null Count   Dtype  \n",
      "---  ------        --------------   -----  \n",
      " 0   VAERS_ID      753040 non-null  int64  \n",
      " 1   RECVDATE      753040 non-null  object \n",
      " 2   STATE         637152 non-null  object \n",
      " 3   AGE_YRS       671951 non-null  float64\n",
      " 4   CAGE_YR       604330 non-null  float64\n",
      " 5   CAGE_MO       4304 non-null    float64\n",
      " 6   SEX           753040 non-null  object \n",
      " 7   RPT_DATE      928 non-null     object \n",
      " 8   SYMPTOM_TEXT  752463 non-null  object \n",
      " 9   DIED          10534 non-null   object \n",
      " 10  DATEDIED      9366 non-null    object \n",
      " 11  L_THREAT      11113 non-null   object \n",
      " 12  ER_VISIT      127 non-null     object \n",
      " 13  HOSPITAL      47457 non-null   object \n",
      " 14  HOSPDAYS      31192 non-null   float64\n",
      " 15  X_STAY        378 non-null     object \n",
      " 16  DISABLE       11970 non-null   object \n",
      " 17  RECOVD        679613 non-null  object \n",
      " 18  VAX_DATE      697225 non-null  object \n",
      " 19  ONSET_DATE    687138 non-null  object \n",
      " 20  NUMDAYS       658446 non-null  float64\n",
      " 21  LAB_DATA      229185 non-null  object \n",
      " 22  V_ADMINBY     753040 non-null  object \n",
      " 23  V_FUNDBY      997 non-null     object \n",
      " 24  OTHER_MEDS    372108 non-null  object \n",
      " 25  CUR_ILL       208513 non-null  object \n",
      " 26  HISTORY       374854 non-null  object \n",
      " 27  PRIOR_VAX     36452 non-null   object \n",
      " 28  SPLTTYPE      219962 non-null  object \n",
      " 29  FORM_VERS     753040 non-null  int64  \n",
      " 30  TODAYS_DATE   747480 non-null  object \n",
      " 31  BIRTH_DEFECT  459 non-null     object \n",
      " 32  OFC_VISIT     144517 non-null  object \n",
      " 33  ER_ED_VISIT   90288 non-null   object \n",
      " 34  ALLERGIES     298538 non-null  object \n",
      "dtypes: float64(5), int64(2), object(28)\n",
      "memory usage: 1.4 GB\n"
     ]
    }
   ],
   "source": [
    "# Load data\n",
    "vdata = pd.read_csv(\"data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\", low_memory=False) \n",
    "vdata.info(memory_usage=\"deep\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5258bc2e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "VAERS_ID int64 5\n",
      "RECVDATE object 48\n",
      "STATE object 39\n",
      "AGE_YRS float64 5\n",
      "CAGE_YR float64 5\n",
      "CAGE_MO float64 5\n",
      "SEX object 41\n",
      "RPT_DATE object 23\n",
      "SYMPTOM_TEXT object 496\n",
      "DIED object 23\n",
      "DATEDIED object 23\n",
      "L_THREAT object 23\n",
      "ER_VISIT object 22\n",
      "HOSPITAL object 24\n",
      "HOSPDAYS float64 5\n",
      "X_STAY object 22\n",
      "DISABLE object 23\n",
      "RECOVD object 39\n",
      "VAX_DATE object 46\n",
      "ONSET_DATE object 45\n",
      "NUMDAYS float64 5\n",
      "LAB_DATA object 53\n",
      "V_ADMINBY object 43\n",
      "V_FUNDBY object 23\n",
      "OTHER_MEDS object 50\n",
      "CUR_ILL object 33\n",
      "HISTORY object 50\n",
      "PRIOR_VAX object 25\n",
      "SPLTTYPE object 32\n",
      "FORM_VERS int64 5\n",
      "TODAYS_DATE object 47\n",
      "BIRTH_DEFECT object 22\n",
      "OFC_VISIT object 26\n",
      "ER_ED_VISIT object 25\n",
      "ALLERGIES object 36\n"
     ]
    }
   ],
   "source": [
    "# Inspect the size of each column\n",
    "for name in vdata.columns:\n",
    "    col_bytes = vdata[name].memory_usage(index=False, deep=True) \n",
    "    col_type = vdata[name].dtype\n",
    "    print(name, col_type, col_bytes // (1024 ** 2)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "75883fe5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "753040"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Review the Died column\n",
    "vdata.DIED.memory_usage(index=False, deep=True) \n",
    "vdata.DIED.fillna(False).astype(bool).memory_usage(index=False, deep=True) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "551e9be4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "753040"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# State column\n",
    "vdata[\"STATE\"] = vdata.STATE.str.upper() \n",
    "states = list(vdata[\"STATE\"].unique()) \n",
    "vdata[\"encoded_state\"] = vdata.STATE.apply(lambda state: states.index(state)) \n",
    "vdata[\"encoded_state\"] = vdata[\"encoded_state\"].astype(np.uint8) \n",
    "vdata[\"STATE\"].memory_usage(index=False, deep=True) \n",
    "vdata[\"encoded_state\"].memory_usage(index=False, deep=True) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "26d93734",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply optimizations while loading the data\n",
    "states = list(pd.read_csv(\"vdata_sample.csv.gz\",\n",
    "    converters={\"STATE\": lambda state: state.upper()}, \n",
    "    usecols=[\"STATE\"] \n",
    ")[\"STATE\"].unique()) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bd132f8b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/53/kmyyy3057lndfb0bpwx_2pkr0000gn/T/ipykernel_44258/2124166299.py:2: DtypeWarning: Columns (12) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  vdata = pd.read_csv(\"vdata_sample.csv.gz\", index_col=\"VAERS_ID\",\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 677736 entries, 1184807 to 1034236\n",
      "Data columns (total 33 columns):\n",
      " #   Column        Non-Null Count   Dtype  \n",
      "---  ------        --------------   -----  \n",
      " 0   RECVDATE      677736 non-null  object \n",
      " 1   STATE         677736 non-null  uint8  \n",
      " 2   AGE_YRS       604833 non-null  float64\n",
      " 3   CAGE_YR       543768 non-null  float64\n",
      " 4   CAGE_MO       3875 non-null    float64\n",
      " 5   SEX           677736 non-null  object \n",
      " 6   RPT_DATE      824 non-null     object \n",
      " 7   DIED          677736 non-null  bool   \n",
      " 8   DATEDIED      8404 non-null    object \n",
      " 9   L_THREAT      10032 non-null   object \n",
      " 10  ER_VISIT      114 non-null     object \n",
      " 11  HOSPITAL      42625 non-null   object \n",
      " 12  HOSPDAYS      28022 non-null   float64\n",
      " 13  X_STAY        339 non-null     object \n",
      " 14  DISABLE       10786 non-null   object \n",
      " 15  RECOVD        611676 non-null  object \n",
      " 16  VAX_DATE      627531 non-null  object \n",
      " 17  ONSET_DATE    618403 non-null  object \n",
      " 18  NUMDAYS       592548 non-null  float64\n",
      " 19  LAB_DATA      206155 non-null  object \n",
      " 20  V_ADMINBY     677736 non-null  object \n",
      " 21  V_FUNDBY      887 non-null     object \n",
      " 22  OTHER_MEDS    334856 non-null  object \n",
      " 23  CUR_ILL       187582 non-null  object \n",
      " 24  HISTORY       337417 non-null  object \n",
      " 25  PRIOR_VAX     32773 non-null   object \n",
      " 26  SPLTTYPE      198061 non-null  object \n",
      " 27  FORM_VERS     677736 non-null  int64  \n",
      " 28  TODAYS_DATE   672749 non-null  object \n",
      " 29  BIRTH_DEFECT  420 non-null     object \n",
      " 30  OFC_VISIT     130105 non-null  object \n",
      " 31  ER_ED_VISIT   81193 non-null   object \n",
      " 32  ALLERGIES     268586 non-null  object \n",
      "dtypes: bool(1), float64(5), int64(1), object(25), uint8(1)\n",
      "memory usage: 808.5 MB\n"
     ]
    }
   ],
   "source": [
    "# Skip the symptom_text column\n",
    "vdata = pd.read_csv(\"vdata_sample.csv.gz\", index_col=\"VAERS_ID\",\n",
    "    converters={\n",
    "        \"DIED\": lambda died: died == \"Y\", \"STATE\": lambda state: states.index(state.upper())\n",
    "    }, usecols=lambda name: name != \"SYMPTOM_TEXT\")\n",
    "vdata[\"STATE\"] = vdata[\"STATE\"].astype(np.uint8)\n",
    "vdata.info(memory_usage=\"deep\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6e2cac3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# End of Notebook #"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
